library(data.table)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(graphics)
library(grDevices)
library(readr)
library(stats)
library(utils)
# ---- Data import -----------------------------------------------------------
# Monthly CBRT series cover Jan 2020 - Jan 2023 (37 observations); the
# Gümrük / House for Sale / Kira / Zam files are weekly Google Trends exports.
# NOTE(review): this first path is absolute while every other file is read
# relative to the working directory — confirm the working directory or make
# this path relative for reproducibility.
Monthly_USD_Reserves_million_ <- read_csv("~/Desktop/University/Junior/Semester 2/IE360/HW1/Monthly USD Reserves (million).csv",
col_types = cols(Date = col_date(format = "%Y-%m"),
`Bank USD Assets (Million)` = col_number()))
View(Monthly_USD_Reserves_million_)
# Semicolon-delimited CBRT exports use read_delim(delim = ";").
Building_Sold_Turkey <- read_delim("Building Sold Turkey.csv",
delim = ";", escape_double = FALSE, col_types = cols(Date = col_date(format = "%Y-%m"),
Buildings = col_number()), trim_ws = TRUE)
View(Building_Sold_Turkey)
Confidence_Index <- read_delim("Confidence Index.csv",
delim = ";", escape_double = FALSE, col_types = cols(Date = col_date(format = "%Y-%m"),
Confidence = col_number()), trim_ws = TRUE)
View(Confidence_Index)
Foreign_Loan_USD_Million <- read_delim("Foreign Loan USD Million.csv",
delim = ";", escape_double = FALSE, col_types = cols(Date = col_date(format = "%Y-%m"),
Loan = col_number()), trim_ws = TRUE)
View(Foreign_Loan_USD_Million)
# NOTE(review): "Employement" is a misspelling carried by the source file's
# column header; it must stay spelled this way to match the CSV.
Employement_Rates <- read_csv("Employement Rates.csv",
col_types = cols(Date = col_date(format = "%Y-%m"),
`Employement Rate` = col_number()))
View(Employement_Rates)
MA_Deposit_Interest_Rate <- read_delim("MA Deposit Interest Rate.csv",
delim = ";", escape_double = FALSE, col_types = cols(Date = col_date(format = "%Y-%m"),
`Deposit Interest Rate` = col_number()),
trim_ws = TRUE)
View(MA_Deposit_Interest_Rate)
Traded_Stocks <- read_delim("Traded Stocks.csv",
delim = ";", escape_double = FALSE, col_types = cols(Date = col_date(format = "%Y-%m"),
`Traded Stocks (Thousand)` = col_number()),
trim_ws = TRUE)
View(Traded_Stocks)
# Weekly Google Trends exports: skip = 1 drops the "Category" header row.
Gu_mru_k <- read_csv("Gümrük.csv", col_types = cols(Week = col_date(format = "%Y-%m-%d"),
`gümrük: (Türkiye)` = col_number()),
skip = 1)
View(Gu_mru_k)
House_for_Sale <- read_csv("House for Sale.csv",
col_types = cols(Week = col_date(format = "%Y-%m-%d"),
`satılık ev: (Türkiye)` = col_number()),
skip = 1)
View(House_for_Sale)
Kira <- read_csv("Kira.csv", col_types = cols(Week = col_date(format = "%Y-%m-%d"),
`kira: (Türkiye)` = col_number()), skip = 1)
View(Kira)
Housing_Loan_Interest <- read_delim("Housing Loan Interest.csv",
delim = ";", escape_double = FALSE, col_types = cols(Date = col_date(format = "%Y-%m"),
`Interest on Housing Loans` = col_number()),
trim_ws = TRUE)
View(Housing_Loan_Interest)
Monthly_USD_Exchange_Rates <- read_csv("Monthly USD Exchange Rates.csv",
col_types = cols(Date = col_date(format = "%Y-%m"),
`Exchange Rate` = col_number()))
View(Monthly_USD_Exchange_Rates)
TUFE_data <- read_delim("TUFE_data.csv",
delim = ";", escape_double = FALSE, col_types = cols(Date = col_date(format = "%Y-%m"),
TUFE = col_number()), trim_ws = TRUE)
View(TUFE_data)
CD_Card_Expenditure <- read_delim("CD Card Expenditure.csv",
delim = ";", escape_double = FALSE, col_types = cols(Date = col_date(format = "%Y-%m"),
`Expenditures (Thousand TL)` = col_number()),
trim_ws = TRUE)
View(CD_Card_Expenditure)
Zam <- read_csv("Zam.csv", col_types = cols(Week = col_date(format = "%Y-%m-%d"),
`zam: (Türkiye)` = col_number()), skip = 1)
View(Zam)
In this homework, we were asked to use data from the Central Bank of the Republic of Turkey (CBRT) and gather 3 distinct categories of time series. These time series were required to have a correlation of less than 0.5 and also be observed at most in monthly intervals.
The 3 time series we chose were later modeled linearly through the addition of data obtained from Google Trends and the CBRT. In the end, 3 time series regressions were asked to be conducted as part of this homework.
All of the data collected and observed are within the time frame of Jan. 2020 and Jan. 2023, which is a 3-year interval.
The first step was to choose 3 time series that had a correlation of less than 0.5. First data I chose was the “Official Reserve Assets (Million USD Dollars)” from the “International Reserves and Foreign Exchange Liquidity” category. Following are the data and the plot of the corresponding time series.
# Tidy copy of the reserves series: Date plus numeric reserve assets
# (million USD), taken from the imported CBRT table.
Monthly_USD_Reserves <- data.table(Dates = as.Date(Monthly_USD_Reserves_million_$Date), Reserves = as.numeric(Monthly_USD_Reserves_million_$`Bank USD Assets (Million)`))
Monthly_USD_Reserves
str(Monthly_USD_Reserves_million_)
## spc_tbl_ [37 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date : Date[1:37], format: "2020-01-01" "2020-02-01" ...
## $ Bank USD Assets (Million): num [1:37] 102467 107724 92145 86343 90914 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_date(format = "%Y-%m"),
## .. `Bank USD Assets (Million)` = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
# Store the reserves as a plain numeric column. The original code wrapped it
# in ts(), which made ggplot emit "Don't know how to automatically pick scale
# for object of type <ts>" and fall back to a continuous scale; the plotted
# values are identical either way.
Monthly_USD_Reserves_million_$Reserves <- as.numeric(Monthly_USD_Reserves_million_[[2]])
ggplot(Monthly_USD_Reserves_million_, aes(x = Date, y = Reserves)) + xlab("Date") + ylab("Reserve Assets (million USD)") + ggtitle("Monthly USD Reserve Assets (million)") + geom_line()
The second data I chose to work with was the “House Sale Statistics_Turkey_Total sales (piece)” from the “House and Construction Statistics” category. Following are the data and the plot.
# Tidy copy of the house-sales series: Date plus numeric total sales (pieces).
Houses_Sold_Turkey <- data.table(Dates = as.Date(Building_Sold_Turkey$Date), House = as.numeric(Building_Sold_Turkey$Buildings))
Houses_Sold_Turkey
str(Building_Sold_Turkey)
## spc_tbl_ [37 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date : Date[1:37], format: "2020-01-01" "2020-02-01" ...
## $ Buildings: num [1:37] 113615 118753 108670 42783 50936 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_date(format = "%Y-%m"),
## .. Buildings = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
# Add a ts copy of the sales column (used again in the series-2 section);
# the plot below maps the raw numeric Buildings column, so no scale warning.
Building_Sold_Turkey$Houses <- ts(Building_Sold_Turkey[,2], start = 2020, frequency = 12)
ggplot(Building_Sold_Turkey, aes(x = Date, y = Buildings)) + xlab("Date") + ylab("Houses Sold (Piece)") + ggtitle("Monthly Total Houses Sold in Turkey") + geom_line()
The final data I selected to work was the “Consumer Confidence Index” from the “Consumer Tendency Survey” category. Data and the plot is given below.
# Tidy copy of the consumer confidence index: Date plus numeric index value.
Confidence_Indexes <- data.table(Dates = as.Date(Confidence_Index$Date), Conf_Ind = as.numeric(Confidence_Index$Confidence))
Confidence_Indexes
str(Confidence_Index)
## spc_tbl_ [37 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date : Date[1:37], format: "2020-01-01" "2020-02-01" ...
## $ Confidence: num [1:37] 81.1 79.3 81 78.2 82.9 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_date(format = "%Y-%m"),
## .. Confidence = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
# Store the index as a plain numeric column. ts() here only triggered
# ggplot's "<ts> scale" warning without changing the plotted values, and the
# column is consumed later (corr_check) as plain numbers anyway.
Confidence_Index$confidence_ind <- as.numeric(Confidence_Index[[2]])
ggplot(Confidence_Index, aes(x = Date, y = confidence_ind)) + xlab("Date") + ylab("Confidence Index (%)") + ggtitle("Monthly Consumer Confidence Index") + geom_line()
Since the requirement of plots not having a correlation greater than 0.5 is asked, next the correlation between the data are checked to see if they fit this requirement.
# Pairwise correlation check across the three candidate series; the
# homework requires every pairwise correlation to stay below 0.5.
corr_check <- data.frame(
  Monthly_USD_Reserves_million_$Reserves,
  Building_Sold_Turkey$Buildings,
  Confidence_Index$confidence_ind
)
corr_check
ggpairs(corr_check)
As seen on the correlation chart, no two pairs of the data chosen have a correlation greater than 0.5. This means that we can move on with working individually with these data to create time series regression models.
While doing the regression, one of the most important factors in choosing the independent variables is to find relevant series that’ll be helpful to explain why the original data is acting the way it is. Thus, relevant data should be chosen to add to the linear model. The data sets I used for this model are Foreign USD Loans, Employment Rates, Total Interest for Deposits opened in USD, Amount of Stocks Traded and Google Trends data for “Gümrük” to see if they have an effect on the USD Reserve assets.
As for the factors that make up the assets in the first place, loans play a big role to come up with that value since it contributes to the reserves and is a prominent factor in the asset calculation. Employment rates would be in the calculations of assets as well since it is considered as the human capital, so there should be a relation with the USD reserve assets as well. Total Interest for deposits opened in USD can also be a valuable data that may have a relation as interests encourage or discourage people to deposit their money and effect the reserves directly. Amount of stocks traded will be bringing cash to the reserves as stocks are bought with money so it may have a relation with the USD reserve assets. Finally customs may have a relation with the reserves in encouraging foreigners to invest in Turkey or not since their investments will be passing through the customs.
The first data I checked was the foreign USD loans, data and the plot of the foreign USD loans is as follows:
# Tidy copy of the foreign-loans series.
# NOTE(review): the Dates column is borrowed from the reserves table, not the
# loan table — this assumes the two files cover identical months in identical
# order; verify against the raw CSVs.
Foreign_Loan <- data.table(Dates = as.Date(Monthly_USD_Reserves_million_$Date), For_Loan = as.numeric(Foreign_Loan_USD_Million$Loan))
Foreign_Loan
str(Foreign_Loan_USD_Million)
## spc_tbl_ [37 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date: Date[1:37], format: "2020-01-01" "2020-02-01" ...
## $ Loan: num [1:37] 96502 96165 92610 90664 99180 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_date(format = "%Y-%m"),
## .. Loan = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
# Store the loans as a plain numeric column. The ts() wrapper only caused
# ggplot's "<ts> scale" warning; the plotted values are identical and the
# later series_1 construction reads the original `Loan` column directly.
Foreign_Loan_USD_Million$Loans <- as.numeric(Foreign_Loan_USD_Million[[2]])
ggplot(Foreign_Loan_USD_Million, aes(x = Date, y = Loans)) + geom_line()
Now a new series is created so that the correlations can be checked to see if our data is relevant. Also from this series, a linear model can be created later on.
# Working table for the first regression: reserves (dependent) + loans
# (first independent variable); more regressors are appended below.
series_1 <- data.table(Dates = as.Date(Monthly_USD_Reserves_million_$Date), Reserves = as.numeric(Monthly_USD_Reserves_million_$`Bank USD Assets (Million)`), Loans = Foreign_Loan_USD_Million$Loan)
head(series_1,12)
ggpairs(series_1)
We see that there is a strong correlation between reserves and loans, meaning that it will contribute good to the linear model. Now, the linear model is built with reserves as our dependent and loans as our independent variable. Then, the residual is checked for this model for the R^2 and fit to the normal distribution.
# First model: reserves regressed on foreign loans only.
lm_1 <- lm(Reserves ~ Loans, series_1 )
summary(lm_1)
##
## Call:
## lm(formula = Reserves ~ Loans, data = series_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14929 -8454 -1971 5412 23706
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.195e+04 1.283e+04 2.490 0.0177 *
## Loans 5.915e-01 1.070e-01 5.526 3.25e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10300 on 35 degrees of freedom
## Multiple R-squared: 0.466, Adjusted R-squared: 0.4507
## F-statistic: 30.54 on 1 and 35 DF, p-value: 3.254e-06
checkresiduals(lm_1$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 55.071, df = 7, p-value = 1.443e-09
##
## Model df: 0. Total lags used: 7
As mentioned earlier, we see that loans are a factor that contributes to the reserve assets, which we can deduce from the positive coefficient loans took. This positive coefficient indicates that increasing the loans will also increase the reserves in that time period.
Next addition is the employment rates. The data and the plot for employement rate is as follows:
# Tidy copy of the employment-rate series.
# NOTE(review): Dates again borrowed from the reserves table — assumes
# row-for-row alignment with the employment file; verify.
Employ_Rate <- data.table(Dates = as.Date(Monthly_USD_Reserves_million_$Date), Empl_R = as.numeric(Employement_Rates$`Employement Rate`))
Employ_Rate
str(Employement_Rates)
## spc_tbl_ [37 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date : Date[1:37], format: "2020-01-01" "2020-02-01" ...
## $ Employement Rate: num [1:37] 43.5 43.6 41.9 40.3 41 43 43 43.8 44 43.6 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_date(format = "%Y-%m"),
## .. `Employement Rate` = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
# ts copy of the rate column; plotting a <ts> column is what produces the
# ggplot scale warning below. This exact assignment is repeated verbatim in
# the series-2 section later in the file (idempotent, but redundant).
Employement_Rates$Rates <- ts(Employement_Rates[,2], start = 2020, frequency = 12)
ggplot(Employement_Rates, aes(x = Date, y = Rates)) + geom_line()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
The correlation with employement rate added is checked:
# Append the employment rate to the regression table.
# NOTE(review): Employement_Rates[,2] is a one-column tibble, not a vector —
# this worked in the knit, but `Employement_Rates[[2]]` would be safer.
series_1$Employement_Rate <- Employement_Rates[,2]
head(series_1,12)
ggpairs(series_1)
Once again a strong correlation between reserve assets and the newly added series of employment rate can be seen. Now the same process for the linear model is repeated.
# Second model: add employment rate as a regressor (lm_1 is reassigned).
lm_1 <- lm(Reserves ~ Loans + Employement_Rate, series_1 )
summary(lm_1)
##
## Call:
## lm(formula = Reserves ~ Loans + Employement_Rate, data = series_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17469 -8579 1026 6269 19350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.674e+04 4.475e+04 -1.491 0.1451
## Loans 1.448e-01 2.197e-01 0.659 0.5143
## Employement_Rate 3.359e+03 1.466e+03 2.291 0.0283 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9723 on 34 degrees of freedom
## Multiple R-squared: 0.5374, Adjusted R-squared: 0.5102
## F-statistic: 19.75 on 2 and 34 DF, p-value: 2.036e-06
checkresiduals(lm_1$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 49.985, df = 7, p-value = 1.454e-08
##
## Model df: 0. Total lags used: 7
The R^2 is increased by 0.06, which is a good sign as we want R^2 to eventually reach or come as close as possible to 1. As predicted, employment rate added positively to the reserves as it took a positive coefficient in the model. This means that more employment will lead to more reserve assets in the corresponding time period. The third data is the total interest rates on deposits opened in USD. The data and the plot for the interest rates for deposits are as follows:
# Tidy copy of the USD deposit-interest-rate series (dates borrowed from the
# reserves table, as with the other regressors).
Deposit_Interest <- data.table(Dates = as.Date(Monthly_USD_Reserves_million_$Date), Dep_Int = as.numeric(MA_Deposit_Interest_Rate$`Deposit Interest Rate`))
Deposit_Interest
str(MA_Deposit_Interest_Rate)
## spc_tbl_ [37 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date : Date[1:37], format: "2020-01-01" "2020-02-01" ...
## $ Deposit Interest Rate: num [1:37] 1.37 1.17 0.96 1.1 0.75 0.76 0.77 0.98 1.24 1.33 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_date(format = "%Y-%m"),
## .. `Deposit Interest Rate` = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
# ts copy of the interest column; the <ts> class triggers the scale warning.
MA_Deposit_Interest_Rate$Interest <- ts(MA_Deposit_Interest_Rate[,2], start = 2020, frequency = 12)
ggplot(MA_Deposit_Interest_Rate, aes(x = Date, y = Interest)) + geom_line()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
Now we check for the correlation to see if it is indeed correlated to the reserves.
# Append the deposit interest rate (again a one-column tibble assignment).
series_1$Interest <- MA_Deposit_Interest_Rate[,2]
head(series_1,12)
ggpairs(series_1)
Unfortunately, we see that the correlation between the reserves and the total interest for deposits aren’t strong as thought initially. Later, we check how it effects our linear model.
# Third model: add the deposit interest rate.
lm_1 <- lm(Reserves ~ Loans + Employement_Rate + Interest, series_1 )
summary(lm_1)
##
## Call:
## lm(formula = Reserves ~ Loans + Employement_Rate + Interest,
## data = series_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16613.1 -6745.9 311.6 6727.2 16990.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.613e+04 4.338e+04 -1.525 0.1369
## Loans 3.339e-01 2.377e-01 1.405 0.1695
## Employement_Rate 2.999e+03 1.435e+03 2.089 0.0445 *
## Interest -5.122e+03 2.866e+03 -1.787 0.0830 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9424 on 33 degrees of freedom
## Multiple R-squared: 0.5782, Adjusted R-squared: 0.5399
## F-statistic: 15.08 on 3 and 33 DF, p-value: 2.37e-06
checkresiduals(lm_1$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 53.501, df = 7, p-value = 2.952e-09
##
## Model df: 0. Total lags used: 7
Looking at the residuals, we see that the addition of total interest for deposits increased the R^2 by 0.06, which means that it helped in the explanation of the reserves series. From the negative coefficient resulting from the model for the total interest for deposits, we can deduce that an increase in the interest will decrease the reserves. The fourth data set investigated is the total amount of stocks traded. The data and the plot are as follows:
# Tidy copy of the traded-stocks series (thousands).
Stocks_Traded <- data.table(Dates = as.Date(Monthly_USD_Reserves_million_$Date), Stocks = as.numeric(Traded_Stocks$`Traded Stocks (Thousand)`))
Stocks_Traded
str(Traded_Stocks)
## spc_tbl_ [37 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date : Date[1:37], format: "2020-01-01" "2020-02-01" ...
## $ Traded Stocks (Thousand): num [1:37] 4272819 3342049 3898605 4857854 5382791 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_date(format = "%Y-%m"),
## .. `Traded Stocks (Thousand)` = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
# ts copy of the stocks column; the <ts> class triggers the scale warning.
Traded_Stocks$Stocks <- ts(Traded_Stocks[,2], start = 2020, frequency = 12)
ggplot(Traded_Stocks, aes(x = Date, y = Stocks)) + geom_line()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
Next we check the correlations:
# Append the traded-stocks series to the regression table.
series_1$Stocks <- Traded_Stocks[,2]
head(series_1,12)
ggpairs(series_1)
Unfortunately the correlation once again isn’t strong.
# Fourth model: add traded stocks.
lm_1 <- lm(Reserves ~ Loans + Employement_Rate + Interest + Stocks, series_1 )
summary(lm_1)
##
## Call:
## lm(formula = Reserves ~ Loans + Employement_Rate + Interest +
## Stocks, data = series_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17078.5 -5037.6 939.4 5998.3 17825.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.221e+04 4.535e+04 -1.151 0.2582
## Loans 3.438e-01 2.376e-01 1.447 0.1577
## Employement_Rate 2.724e+03 1.458e+03 1.868 0.0709 .
## Interest -3.477e+03 3.270e+03 -1.063 0.2957
## Stocks -9.391e-04 9.032e-04 -1.040 0.3062
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9412 on 32 degrees of freedom
## Multiple R-squared: 0.592, Adjusted R-squared: 0.541
## F-statistic: 11.61 on 4 and 32 DF, p-value: 6.174e-06
checkresiduals(lm_1$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 52.093, df = 7, p-value = 5.597e-09
##
## Model df: 0. Total lags used: 7
With the weak correlation, the impact on the R^2 is also not in a significant amount but still able to increase the R^2 to be able to explain the series. From this result, we can see that increase of the stock transactions lead to a decrease at the reserve amounts
Another seasonal factor that we can look into is the the effect of pandemic. After 2021 April, Turkey entered to the gradual normalization period which effected the trades and also the USD reserve asset amount. By adding a categorical variable by giving 1 to the period before the gradual normalization, we can check for the effects of the pandemic. Meanwhile, we can also check to see if trend plays a factor in the series.
First, we add the trend.
# Add a linear trend index (1..n) and a calendar-month indicator (1..12).
# The data start in January 2020, so row 1 corresponds to month 1.
series_1$trend <- seq_len(nrow(series_1))
# rep_len() tiles 1:12 across all 37 rows explicitly; the original
# cbind(series_1, months) relied on recycling and emitted a data.table
# "recycled with remainder" warning while producing the same values.
series_1$months <- rep_len(1:12, nrow(series_1))
Now we give the condition for to select the period of pandemics.
# Pandemic indicator: 1 for the first 16 observations (Jan 2020 - Apr 2021,
# before Turkey's gradual-normalization period), 0 afterwards. Vectorized
# as.integer(condition) replaces the init-then-subset-assign original.
series_1$before_gradual_normalization <- as.integer(series_1$trend < 17)
series_1
We can later check for the correlation.
ggpairs(series_1)
Now we add the trend and pandemic variables to our linear model.
# Fifth model: add trend and the pandemic indicator.
lm_1 <- lm(Reserves ~ Loans + Employement_Rate + Interest + Stocks + trend + before_gradual_normalization, series_1 )
summary(lm_1)
##
## Call:
## lm(formula = Reserves ~ Loans + Employement_Rate + Interest +
## Stocks + trend + before_gradual_normalization, data = series_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15099 -6256 1029 5217 18259
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.271e+04 8.237e+04 -0.397 0.694
## Loans 6.688e-01 5.027e-01 1.331 0.193
## Employement_Rate 1.847e+03 1.709e+03 1.080 0.289
## Interest -1.629e+03 3.627e+03 -0.449 0.656
## Stocks -4.888e-04 9.416e-04 -0.519 0.607
## trend -9.195e+02 8.276e+02 -1.111 0.275
## before_gradual_normalization -1.346e+04 8.704e+03 -1.546 0.133
##
## Residual standard error: 9323 on 30 degrees of freedom
## Multiple R-squared: 0.6247, Adjusted R-squared: 0.5496
## F-statistic: 8.323 on 6 and 30 DF, p-value: 2.361e-05
checkresiduals(lm_1$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 62.06, df = 7, p-value = 5.848e-11
##
## Model df: 0. Total lags used: 7
We can see that the addition of these two variables in fact increased our R^2 by 0.009, so a part of our series was related to the pandemic and the trend. The pandemic period affected the series negatively, as indicated by the negative coefficient assigned to it. Similarly, the trend impacts our series negatively.
Last data added to the model is the information about the customs. For this data, we use the Google Trends search values for “Gümrük” since the search rate for customs may also give an insight on the amount of investors interested in Turkey.
Since this is a weekly Google Trends data that we will be working on, we can start by aggregating the data and adding it to the series. The original data and the plot for the aggregated data are shown below.
# Weekly Google Trends data for "gümrük" (customs).
head(Gu_mru_k,18)
# Map columns directly inside aes(); the original wrote Gu_mru_k$Week and
# Gu_mru_k$`gümrük: (Türkiye)` inside aes(), which bypasses the data argument
# (same picture here, but an anti-pattern that breaks facetting/reuse).
ggplot(Gu_mru_k, aes(x = Week, y = `gümrük: (Türkiye)`)) + geom_line()
As this is weekly data, we can aggregate it into monthly data so that we can work with it in our model. The plot and data for the monthly searches are as follows:
# Sum the weekly Trends values into month x year buckets; month()/year()
# here come from data.table.
bymonth <- aggregate(Gu_mru_k$`gümrük: (Türkiye)`~ month(Week) + year(Week), data = Gu_mru_k, FUN = sum)
bymonth
# Rows 2:38 keep Jan 2020 - Jan 2023; row 1 is presumably a partial leading
# month from the Trends export — TODO confirm against the raw file.
series_1$Gumruk_search <- bymonth[2:38,3]
series_1
ggplot(series_1, aes(x = Dates, y = Gumruk_search)) + geom_line()
Next, we can check for the correlation with our monthly aggregated data added.
ggpairs(series_1)
From the correlations, we can see that the correlation between search of customs and the USD Reserve Assets weren’t closely correlated. We can later on add our variable to our model.
# Final model for series 1: all regressors including the Trends variable.
lm_1 <- lm(Reserves ~ Loans + Employement_Rate + Interest + Stocks + trend + before_gradual_normalization + Gumruk_search, series_1 )
summary(lm_1)
##
## Call:
## lm(formula = Reserves ~ Loans + Employement_Rate + Interest +
## Stocks + trend + before_gradual_normalization + Gumruk_search,
## data = series_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15879.3 -5864.1 821.8 5069.4 16407.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.873e+04 8.240e+04 -0.470 0.642
## Loans 6.385e-01 5.024e-01 1.271 0.214
## Employement_Rate 2.303e+03 1.759e+03 1.309 0.201
## Interest -1.909e+03 3.629e+03 -0.526 0.603
## Stocks -2.542e-04 9.653e-04 -0.263 0.794
## trend -1.019e+03 8.312e+02 -1.226 0.230
## before_gradual_normalization -1.407e+04 8.704e+03 -1.616 0.117
## Gumruk_search -3.383e+01 3.188e+01 -1.061 0.297
##
## Residual standard error: 9304 on 29 degrees of freedom
## Multiple R-squared: 0.6387, Adjusted R-squared: 0.5515
## F-statistic: 7.325 on 7 and 29 DF, p-value: 4.429e-05
checkresiduals(lm_1$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 55.984, df = 7, p-value = 9.512e-10
##
## Model df: 0. Total lags used: 7
The final addition to our model improved our R^2 by 0.005, which is a small increase but still an improvement on our model. Checking by the coefficient once again, the increase in the searches for “Gumruk” decreased the USD Reserve Assets, which is meaningful as customs being searched more may be because of some concerns about them and lead to decreasing inflow of USD assets.
Checking the residuals for our final model, unfortunately looking at the ACF we can see that there is still some more correlation between the residual values. Some of the correlation were outside of the borders we want to stay within. This shows us that there may still be some variable missing in our model and we need to look further to see what can be related to USD Reserve Assets.
Checking the graph, the residuals seem to go beyond the normal distribution curve we want to fit inside, but still there is a resemblance to the aimed curve.
By the final R^2 = 0.5515 value we obtained, we can say that the model we built was only able to explain 55.15% of the actual data that was observed. It is still a significant ratio but not enough to say that it was a strong fit of the actual values given.
Lastly for this series, we can compare the actual reserve values with our linear model and the residuals. For this, we add these two to our series data frame and then plot these to see if they are correlated or not.
# Store the fitted values and residuals of the final model for plotting.
series_1$model <- fitted(lm_1)
series_1$residual <- residuals(lm_1)
series_1
# Residuals vs fitted: a pattern here indicates remaining structure.
ggplot(series_1, aes(x = model, y = residual)) + geom_point()
# Actual vs fitted with a regression line; pass method as the string "lm"
# (the documented form) rather than the bare function object.
ggplot(series_1, aes(x = Reserves, y = model)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
When we look at the residual versus the model we built, we can see that there are certain model values residuals are gathered around. This may indicate that there is still a correlation between the residuals and the model that we made.
After looking at our model and the reserves, we see that a correlation between our model and the actual values of the reserves. There is still a variation between those two and values that are significantly away from creating a correlation, but they can be improved upon further additions of variables to the model.
Lastly for this model, let’s see how well they fit on a time series graph.
# Overlay actual reserves and model fit. Map series LABELS inside aes() and
# pin the real colours with scale_color_manual(): the original
# aes(color = "orange") only created a legend entry literally named "orange"
# drawn in ggplot's default palette, not in orange.
ggplot(series_1, aes(x = Dates)) +
  geom_line(aes(y = Reserves, color = "Actual")) +
  geom_line(aes(y = model, color = "Model")) +
  scale_color_manual(values = c(Actual = "orange", Model = "blue"))
From these two, we can see that our model captured some of the aspects of the actual graph of reserve assets, but it has some major missing patterns such as the drops in the middle of 2020 and at the beginning of 2022. But overall it may represent a general idea of how the USD Reserve assets changed in this period.
Starting for the second series, which is the total Houses sold, let’s start with bringing the original data and the plot.
# Second series: total houses sold. The plot maps the <ts> Houses column
# created earlier, hence the scale warning below.
Houses_Sold_Turkey
ggplot(Building_Sold_Turkey, aes(x = Date, y = Houses)) + xlab("Date") + ylab("Houses Sold (Piece)") + ggtitle("Monthly Total Houses Sold in Turkey") + geom_line()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
The datas I used for this model are Employment Rate, Housing Loan Interest Applied by Banks and Google Trends datas for “Satılık Ev” and “Kira”.
The amount of notices for houses on sale can be directly related to the amount of houses that are sold during this period, as the supply in the market would be higher. Employment rates may also be related to the houses sold as more employees with a steady income can affect the demand for the houses. The search rates for rents can also be related as it gives an insight on the people who are looking for the substitute of houses on sale and perhaps it shows amount of people interested in making decisions about moving. Lastly the housing loan interests applied by the banks can be strongly related on the decisions of people to buy a house or not, and have an effect on the subject we are investigating.
We can initiate the series data frame we will be using for this time series and check for autocorrelation to see if the spikes seen in the graph is seasonal or not.
# Working table for the second regression (houses sold). as.numeric() strips
# the ts class from the Houses column created in the earlier chunk.
series_2 <- data.table(Dates = as.Date(Building_Sold_Turkey$Date), Houses = as.numeric(Building_Sold_Turkey$Houses))
head(series_2,12)
# Sample ACF up to lag 37 to look for trend/seasonality in the sales series.
acf(series_2$Houses,37)
From the autocorrelation function, we can see that there is no certain trend involved as the acf changes without a slope, but a wave pattern can be seen, with several peaks through the lags. We can check with seasonality variables to see if this wave pattern is related to the time of year.
# Add trend (1..n) and calendar-month (1..12) columns; data start Jan 2020.
series_2$trend <- seq_len(nrow(series_2))
# rep_len() tiles 1:12 over all 37 rows explicitly, avoiding the
# "recycled with remainder" warning the original cbind() recycling emitted.
series_2$months <- rep_len(1:12, nrow(series_2))
series_2
ggpairs(series_2)
From the correlations, we can see that there is a stronger connection between the houses sold and the months rather than the trend we were also looking to add to our model.
# First series-2 model: houses sold vs month-of-year.
# NOTE(review): `months` enters as a continuous 1..12 regressor, not as a
# factor — this fits a single slope across the year rather than a separate
# seasonal effect per month; consider factor(months).
lm_2 <- lm(Houses ~ months, series_2 )
summary(lm_2)
##
## Call:
## lm(formula = Houses ~ months, data = series_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66834 -25814 -106 21884 101855
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85769 12961 6.617 1.2e-07 ***
## months 5962 1785 3.340 0.002 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38220 on 35 degrees of freedom
## Multiple R-squared: 0.2417, Adjusted R-squared: 0.22
## F-statistic: 11.16 on 1 and 35 DF, p-value: 0.001999
checkresiduals(lm_2$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 9.7948, df = 7, p-value = 0.2005
##
## Model df: 0. Total lags used: 7
Adding the months as a variable created a model that could be useful as it initiated the R^2.
Since our variable of months is numbered from 1 to 12 and have a positive coefficient, we can say that within a year towards the end more houses are sold in Turkey.
First variable we are adding is the search rates for the “Satılık Ev”. Data and the plot is as follows for the searches:
# Weekly Google Trends data for "satılık ev" (house for sale).
House_Sale <- data.table(Dates = as.Date(House_for_Sale$Week), House_search = as.numeric(House_for_Sale$`satılık ev: (Türkiye)`))
head(House_Sale,18)
# Keep the searches as a plain numeric column: the original wrapped this
# WEEKLY series in ts(..., frequency = 12), mislabelling it as monthly, and
# the ts class would trip ggplot's scale handling. The raw values are
# unchanged, so the monthly aggregation below is unaffected.
House_for_Sale$Searches <- as.numeric(House_for_Sale[[2]])
# Map columns directly in aes() instead of House_for_Sale$... references.
ggplot(House_for_Sale, aes(x = Week, y = `satılık ev: (Türkiye)`)) + geom_line()
As this data is given as weekly, we need to aggregate it into monthly data and look into the graph again.
# Sum the weekly search values into month x year buckets (data.table's
# month()/year()).
Search_bymonth <- aggregate(House_for_Sale$Searches~ month(Week) + year(Week), data = House_for_Sale, FUN = sum)
Search_bymonth
# Rows 2:38 keep Jan 2020 - Jan 2023; row 1 is presumably a partial leading
# month — TODO confirm against the raw Trends export.
series_2$House_for_Sale <- Search_bymonth[2:38,3]
ggplot(series_2, aes(x = Dates, y = House_for_Sale)) + geom_line()
Now that we have the monthly aggregated data for searches, we can check for the correlation.
ggpairs(series_2)
We see from the correlation chart that the house search amounts aren’t as strongly correlated as hoped. We can check the impact of the searches to our linear model nonetheless.
# Second series-2 model: add the house-for-sale search volume.
lm_2 <- lm(Houses ~ months + House_for_Sale, series_2 )
summary(lm_2)
##
## Call:
## lm(formula = Houses ~ months + House_for_Sale, data = series_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80884 -23351 -1647 24579 91173
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65543.2 23485.1 2.791 0.00856 **
## months 5944.3 1783.3 3.333 0.00208 **
## House_for_Sale 117.5 113.9 1.032 0.30921
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38180 on 34 degrees of freedom
## Multiple R-squared: 0.2648, Adjusted R-squared: 0.2215
## F-statistic: 6.122 on 2 and 34 DF, p-value: 0.005362
checkresiduals(lm_2$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 8.5081, df = 7, p-value = 0.2899
##
## Model df: 0. Total lags used: 7
Looking at the R^2 value, addition of this data didn’t have a major effect on it as there is only an increase of 0.0015.
We can say that with the positive coefficient of the House for Sale searches, increase in searches increases the houses sold in Turkey. Which is logical since more searches mean that either more people are buying houses or putting on sale. If we inspect the residual analysis part, we can see that ACF is within the aimed range meaning there isn’t much of a correlation between the residuals. We can also look at the graph of residuals to see that it resembles a normal distribution, adding to the indications of the white noise series we’ve found.
The next variable I added is the employment rates. The data and the plot for employment rate is as follows:
# Pair the employment-rate series with the shared monthly date vector
# (dates borrowed from the building-sales table, same Jan 2020 start).
Employ_Rate <- data.table(Dates = as.Date(Building_Sold_Turkey$Date), Empl_R = as.numeric(Employement_Rates$`Employement Rate`))
Employ_Rate
str(Employement_Rates)
## spc_tbl_ [37 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date : Date[1:37], format: "2020-01-01" "2020-02-01" ...
## $ Employement Rate: num [1:37] 43.5 43.6 41.9 40.3 41 43 43 43.8 44 43.6 ...
## $ Rates : Time-Series [1:37, 1] from 2020 to 2023: 43.5 43.6 41.9 40.3 41 43 43 43.8 44 43.6 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr "Employement Rate"
## - attr(*, "spec")=
## .. cols(
## .. Date = col_date(format = "%Y-%m"),
## .. `Employement Rate` = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
# Store the employment rate as a monthly ts starting Jan 2020.
Employement_Rates$Rates <- ts(Employement_Rates[,2], start = 2020, frequency = 12)
# NOTE(review): mapping a <ts> column to y makes ggplot emit its
# "Don't know how to automatically pick scale" message; plotting the plain
# numeric column would avoid it.
ggplot(Employement_Rates, aes(x = Date, y = Rates)) + geom_line()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
Now, let’s check for the correlation.
# Add the employment rate to the regression frame. Use [[ ]] so a plain
# numeric vector is stored: `Employement_Rates[,2]` on a tibble returns a
# one-column tibble, which nests a data frame inside the data.table column.
series_2$Employment_Rate <- Employement_Rates[[2]]
series_2
ggpairs(series_2)
The correlation unfortunately didn't yield a strong one. Let's add this variable to our model and see its effect.
# Extend the model with the employment rate.
lm_2 <- lm(Houses ~ months + House_for_Sale + Employment_Rate, series_2 )
summary(lm_2)
##
## Call:
## lm(formula = Houses ~ months + House_for_Sale + Employment_Rate,
## data = series_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72358 -20016 -4643 22090 95621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -133242.7 136047.0 -0.979 0.33451
## months 5111.3 1840.6 2.777 0.00897 **
## House_for_Sale 177.7 119.0 1.493 0.14496
## Employment_Rate 4285.6 2890.5 1.483 0.14766
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37530 on 33 degrees of freedom
## Multiple R-squared: 0.3107, Adjusted R-squared: 0.248
## F-statistic: 4.958 on 3 and 33 DF, p-value: 0.005975
checkresiduals(lm_2$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 6.8506, df = 7, p-value = 0.4446
##
## Model df: 0. Total lags used: 7
The R^2 value increased a little, by almost 0.03. It isn't a big increase, as our model is still only able to explain 24.8% of the data, but it is an improvement. We see that the employment rate has a positive coefficient in our model, showing us that an increase in employment increases the number of houses sold.
Another addition is the search rates for the word “Kira”. The data and the plot for the weekly searches as follows from the Google trends data.
# Weekly Google Trends data for the search term "kira" (rent).
Kira_Search <- data.table(Dates = as.Date(Kira$Week), Kiras = as.numeric(Kira$`kira: (Türkiye)`))
head(Kira_Search,18)
str(Kira)
## spc_tbl_ [158 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Week : Date[1:158], format: "2019-12-29" "2020-01-05" ...
## $ kira: (Türkiye): num [1:158] 30 28 25 21 23 27 23 25 23 45 ...
## - attr(*, "spec")=
## .. cols(
## .. Week = col_date(format = "%Y-%m-%d"),
## .. `kira: (Türkiye)` = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
# Keep the raw search counts as a ts column.
# NOTE(review): the data is weekly, so frequency = 12 mislabels the seasonal
# cycle (~52 would be correct); the monthly aggregation below works off
# month(Week)/year(Week) and does not use the ts attributes.
Kira$Searches <- ts(Kira[,2], start = 2020, frequency = 12)
# Use bare column names inside aes(); `Kira$Week` bypasses the data argument
# and breaks grouping/faceting semantics.
ggplot(Kira, aes(x = Week, y = `kira: (Türkiye)`)) + geom_line()
As this is a weekly data, we aggregate it before adding to our data frame. The aggregated data and plot is as follows.
# Monthly totals of the weekly "kira" searches. The column name is resolved
# through `data = Kira`, so the `Kira$` prefix is unnecessary in the formula.
Search_bymonth <- aggregate(Searches ~ month(Week) + year(Week), data = Kira, FUN = sum)
Search_bymonth
# Rows 2:38 align the aggregate with series_2's 37 months (row 1 is a
# partial month from the weekly data) -- TODO confirm.
series_2$Kira_Searches <- Search_bymonth[2:38,3]
ggplot(series_2, aes(x = Dates, y = Kira_Searches)) + geom_line()
Now let’s check for the correlation.
ggpairs(series_2)
We see that the correlation between the search rates for rent and houses sold isn’t as strong as we hoped to. Now let’s add it to our model to see if it works.
# Extend the model with the monthly "kira" search totals.
lm_2 <- lm(Houses ~ months + House_for_Sale + Employment_Rate + Kira_Searches, series_2 )
summary(lm_2)
##
## Call:
## lm(formula = Houses ~ months + House_for_Sale + Employment_Rate +
## Kira_Searches, data = series_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74483 -21933 -2384 20379 85793
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -214180.9 144035.1 -1.487 0.1468
## months 4069.5 1935.6 2.102 0.0435 *
## House_for_Sale 262.8 129.9 2.023 0.0515 .
## Employment_Rate 6624.9 3237.8 2.046 0.0490 *
## Kira_Searches -211.8 141.2 -1.500 0.1435
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36830 on 32 degrees of freedom
## Multiple R-squared: 0.3559, Adjusted R-squared: 0.2754
## F-statistic: 4.421 on 4 and 32 DF, p-value: 0.005868
checkresiduals(lm_2$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 6.4162, df = 7, p-value = 0.4921
##
## Model df: 0. Total lags used: 7
Addition of the variable increased our R^2 by almost 0.03, which is once again a small improvement.
The variable we added got a negative coefficient, showing that the tendency of people buying houses lowers as they lean more on the rents.
Lastly, the data on the house loan interests applied by the banks are added. The following is the data and the plot:
# Housing-loan interest rates paired with the shared monthly dates.
House_Loan_Int <- data.table(Dates = as.Date(Building_Sold_Turkey$Date), Loan_Int = as.numeric(Housing_Loan_Interest$`Interest on Housing Loans`))
House_Loan_Int
str(Housing_Loan_Interest)
## spc_tbl_ [37 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date : Date[1:37], format: "2020-01-01" "2020-02-01" ...
## $ Interest on Housing Loans: num [1:37] 11.9 11.4 11.4 11.7 11.2 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_date(format = "%Y-%m"),
## .. `Interest on Housing Loans` = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
# Store the loan interest rate as a monthly ts starting Jan 2020.
Housing_Loan_Interest$Int <- ts(Housing_Loan_Interest[,2], start = 2020, frequency = 12)
# Refer to the column by (backticked) name inside aes(); the
# `Housing_Loan_Interest$...` form bypasses the data argument.
ggplot(Housing_Loan_Interest, aes(x = Date, y = `Interest on Housing Loans`)) + geom_line()
Now let’s check the correlations.
# Add the loan interest rate to the regression frame. [[ ]] extracts a plain
# numeric vector; `[,2]` on a tibble would store a nested one-column tibble.
series_2$Housing_Int <- Housing_Loan_Interest[[2]]
series_2
ggpairs(series_2)
The correlation seems to be weak again. Let’s add this variable to see how it performs.
# Final house-sales model: add the housing-loan interest rate.
lm_2 <- lm(Houses ~ months + House_for_Sale + Employment_Rate + Kira_Searches + Housing_Int, series_2 )
summary(lm_2)
##
## Call:
## lm(formula = Houses ~ months + House_for_Sale + Employment_Rate +
## Kira_Searches + Housing_Int, data = series_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59226 -18657 356 17856 68765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -312712.50 130301.00 -2.400 0.022593 *
## months 5136.54 1734.07 2.962 0.005820 **
## House_for_Sale -34.91 146.92 -0.238 0.813769
## Employment_Rate 12672.69 3410.30 3.716 0.000799 ***
## Kira_Searches -25.31 137.03 -0.185 0.854680
## Housing_Int -9565.41 2968.86 -3.222 0.002989 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32390 on 31 degrees of freedom
## Multiple R-squared: 0.5175, Adjusted R-squared: 0.4397
## F-statistic: 6.65 on 5 and 31 DF, p-value: 0.000259
checkresiduals(lm_2$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 7.7784, df = 7, p-value = 0.3525
##
## Model df: 0. Total lags used: 7
Addition of the interest rates applied to housing loans improved our model significantly as the R^2 increased by 0.17. In the final status, the model that we created can explain only the 43.97% of the actual data, which is actually a relatively low fit. We can also say that the increase on housing loan interest will be decreasing the sales of houses as the variable has a negative coefficient.
Moving on to the residual analysis, the ACF almost corresponds to the white noise series with almost all of the lags being within the border. Only the first lag is exceeding the border but with a little difference. We can also say that our residuals looking similar to the normal distribution, which adds to the justifications that our residual may indeed be a white noise series.
Now, let’s compare our model values with the actual values and our residual.
# Attach fitted values and residuals for diagnostic plots.
series_2$model <- fitted(lm_2)
series_2$residual <- residuals(lm_2)
series_2
# Residuals vs fitted: look for curvature or heteroscedasticity.
ggplot(series_2, aes(x = model, y = residual)) + geom_point()
# Fitted vs actual with an OLS reference line.
ggplot(series_2, aes(x = Houses, y = model)) + geom_point() + geom_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'
From the residual versus the model scatter-plot, we may observe a curve that the residuals are following, which may indicate a correlation that is present and to be further looked upon.
In the model versus the actual value plot, we see that the data points are gathered around a certain area but do not form a linear pattern. This indicates that the correlation between our model and the actual data may be relatively low.
Now let’s see how well our model fit to the actual data on a graph.
# Actual vs fitted house sales over time. String literals inside aes() are
# treated as factor levels drawn in the default palette, so the original
# legend showed the wrong colours labelled "orange"/"blue". Map descriptive
# labels instead and pin the real colours with scale_color_manual().
ggplot(series_2, aes(x = Dates)) +
  geom_line(aes(y = Houses, color = "Actual")) +
  geom_line(aes(y = model, color = "Fitted")) +
  scale_color_manual(name = NULL, values = c(Actual = "orange", Fitted = "blue"))
We can see from the plots that our model picked up some of the biggest aspects of the actual value, such as the peaks and the drops in the houses sold. But for the year 2022, our model couldn’t fit the values to catch the drop in sales and the sudden increases later on. This indicates that perhaps more seasonal variables could be added to the model so that the peaks for 2022 could be explained better.
Before continuing with the final dependent variable of Confidence Trust index, let’s bring the data and the corresponding plot.
# Echo the consumer-confidence table. The object created above is
# `Confidence_Index`; `Confidence_Indexes` is an undefined name and would
# error at run time.
Confidence_Index
ggplot(Confidence_Index, aes(x = Date, y = confidence_ind)) + xlab("Date") + ylab("Confidence Index (%)") + ggtitle("Monthly Consumer Confidence Index") + geom_line()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
As the consumer confidence index is about how consumer is feeling about their financial power/stability and the current situations on the economy, data related to those would be helpful in explaining and linearizing the series. The datas I used for this model are USD Exchange Rate, Consumer Price Index, Total Credit and Bank Card Expenditures, Employment Rates and Google Trends datas for “Zam”. As we are interested in the financial status of the consumers, exchange rates may be related for consumers to check whether their financial status are going up or down with respect to the exchange rate trends. Consumer Price Index may also be related since the status of the market and the changes in prices will effect where consumers see themselves financially and how willing they are on spending. Total credit and bank card expenditures will be related to the consumer confidence index as well as it shows an example on the patterns of consumer spendings. Employement rates once again may be related in this subject as it will be effecting amoun of people with an income and thus the consumer confidence on their financial status. Lastly, search rate for price raises may be related on both the side of employees getting a raise and changing their confidences or on the side of the market price increases, which in this case questions consumers own status with respect to the future expectations on the price increases.
First data that I looked at was the exchange rates. The data and the plot are as follows:
# Monthly USD exchange rates paired with the confidence-index dates.
Exchanges <- data.table(Dates = as.Date(Confidence_Index$Date), Exch_rate = as.numeric(Monthly_USD_Exchange_Rates$`Exchange Rate`))
Exchanges
str(Monthly_USD_Exchange_Rates)
## spc_tbl_ [37 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date : Date[1:37], format: "2020-01-01" "2020-02-01" ...
## $ Exchange Rate: num [1:37] 5.92 6.04 6.31 6.82 6.95 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_date(format = "%Y-%m"),
## .. `Exchange Rate` = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
# Store the exchange rate as a monthly ts starting Jan 2020 and plot it.
Monthly_USD_Exchange_Rates$Exchange <- ts(Monthly_USD_Exchange_Rates[,2], start = 2020, frequency = 12)
ggplot(Monthly_USD_Exchange_Rates, aes(x = Date, y = Exchange)) + geom_line()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
Now let’s create the final series data frame and look for a correlation.
# Build the confidence-index regression frame: dates, the response
# (Confidence) and the first predictor (USD exchange rate).
series_3 <- data.table(Dates = as.Date(Confidence_Index$Date), Confidence = as.numeric(Confidence_Index$Confidence), Exchange = as.numeric(Monthly_USD_Exchange_Rates$`Exchange Rate`))
series_3
ggpairs(series_3)
We can see that there is a strong correlation between the confidence index and the exchange rates. Now, let’s initiate our model with the predictor variable of exchange rates.
# Initial confidence-index model: exchange rate only.
lm_3 <- lm(Confidence ~ Exchange, series_3 )
summary(lm_3)
##
## Call:
## lm(formula = Confidence ~ Exchange, data = series_3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.6365 -2.2529 0.2621 2.2280 8.5891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 86.5769 1.7189 50.37 < 2e-16 ***
## Exchange -0.8565 0.1444 -5.93 9.54e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.977 on 35 degrees of freedom
## Multiple R-squared: 0.5012, Adjusted R-squared: 0.4869
## F-statistic: 35.17 on 1 and 35 DF, p-value: 9.541e-07
checkresiduals(lm_3$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 31.4, df = 7, p-value = 5.246e-05
##
## Model df: 0. Total lags used: 7
We can see that with the addition of the exchange rates, the initial R^2 value turned out to be 0.4869, which is a good start for the model. By the coefficient of the exchange rates took, we can deduce that an increase in the exchange rate lowers the consumers confidence index. This is an expected result since the increase of exchange lowers the tendency of consumers to purchase goods in the market and overall lowers their confidence in their financial status.
Next data to inspect is the consumer price index. The data and the plot is as follows:
# Consumer price index (TUFE) paired with the confidence-index dates.
Tufe <- data.table(Dates = as.Date(Confidence_Index$Date), TUFE = as.numeric(TUFE_data$TUFE))
# Echo the CPI table just built; the original printed `Employ_Rate`, a
# copy-paste slip left over from the employment-rate section.
Tufe
str(TUFE_data)
## spc_tbl_ [37 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date: Date[1:37], format: "2020-01-01" "2020-02-01" ...
## $ TUFE: num [1:37] 446 448 451 454 461 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_date(format = "%Y-%m"),
## .. TUFE = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
# Store the CPI as a monthly ts starting Jan 2020 and plot the raw column.
TUFE_data$CPI <- ts(TUFE_data[,2], start = 2020, frequency = 12)
ggplot(TUFE_data, aes(x = Date, y = TUFE)) + geom_line()
Now let’s check the correlations.
# Add the CPI to the regression frame. [[ ]] extracts a plain numeric
# vector; `[,2]` on a tibble would store a nested one-column tibble.
series_3$CPI <- TUFE_data[[2]]
series_3
ggpairs(series_3)
We can see that there is a strong correlation between the consumer confidence index and the consumer price index. Now, let's add this data to our model.
# Extend the confidence model with the consumer price index.
lm_3 <- lm(Confidence ~ Exchange+CPI, series_3 )
summary(lm_3)
##
## Call:
## lm(formula = Confidence ~ Exchange + CPI, data = series_3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.4833 -2.2207 0.6843 2.1121 5.3642
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80.68800 2.10215 38.384 < 2e-16 ***
## Exchange -3.94815 0.80709 -4.892 2.37e-05 ***
## CPI 0.05863 0.01513 3.875 0.000462 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.361 on 34 degrees of freedom
## Multiple R-squared: 0.654, Adjusted R-squared: 0.6337
## F-statistic: 32.13 on 2 and 34 DF, p-value: 1.459e-08
checkresiduals(lm_3$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 20.768, df = 7, p-value = 0.00413
##
## Model df: 0. Total lags used: 7
By the model, we can see that consumer price index was helpful on explaining the consumer confidence index. Addition of the data increased the R^2 by 0.15. With the positive coefficient of the price index, we can say that increase in the price index increased the consumer confidence index within this interval.
The third data to inspect is the total expenditures from debit and credit cards. The data is as follows:
# Total debit/credit card expenditures paired with the monthly dates.
Card_Expend <- data.table(Dates = as.Date(Confidence_Index$Date), Expenditures = as.numeric(CD_Card_Expenditure$`Expenditures (Thousand TL)`))
Card_Expend
str(CD_Card_Expenditure)
## spc_tbl_ [37 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date : Date[1:37], format: "2020-01-01" "2020-02-01" ...
## $ Expenditures (Thousand TL): num [1:37] 96609241 77277941 74989625 55814465 80155346 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_date(format = "%Y-%m"),
## .. `Expenditures (Thousand TL)` = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
# Store expenditures as a monthly ts starting Jan 2020 and plot the raw column.
CD_Card_Expenditure$Expen <- ts(CD_Card_Expenditure[,2], start = 2020, frequency = 12)
ggplot(CD_Card_Expenditure, aes(x = Date, y = `Expenditures (Thousand TL)` )) + geom_line()
Let’s check the correlation
# Add card expenditures to the regression frame. [[ ]] extracts a plain
# numeric vector; `[,2]` on a tibble would store a nested one-column tibble.
series_3$Expenditure <- CD_Card_Expenditure[[2]]
series_3
ggpairs(series_3)
We can see that there is once again a strong correlation between the expenditures and the consumer confidence index. Now let’s add this data to the model.
# Extend the confidence model with card expenditures.
lm_3 <- lm(Confidence ~ Exchange + CPI + Expenditure, series_3 )
summary(lm_3)
##
## Call:
## lm(formula = Confidence ~ Exchange + CPI + Expenditure, data = series_3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.4478 -1.8510 0.3513 2.0107 5.7151
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.266e+01 2.851e+00 28.997 < 2e-16 ***
## Exchange -3.901e+00 8.078e-01 -4.829 3.05e-05 ***
## CPI 5.051e-02 1.707e-02 2.958 0.00568 **
## Expenditure 1.658e-08 1.619e-08 1.024 0.31339
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.359 on 33 degrees of freedom
## Multiple R-squared: 0.6647, Adjusted R-squared: 0.6342
## F-statistic: 21.8 on 3 and 33 DF, p-value: 5.739e-08
checkresiduals(lm_3$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 16.296, df = 7, p-value = 0.02255
##
## Model df: 0. Total lags used: 7
Although it had a strong correlation, the addition of the variable didn't change the R^2 significantly, as there is only a 0.0005 change. We can say that an increase in expenditure results in an increase in consumer confidence, which may be explained by saying that if consumers' confidence in their financial status is high, they tend to increase their credit and debit card expenses.
The fourth data I chose is the employment rates. The data and the plot is as follows:
# Rebuild the employment-rate table, this time keyed to the
# confidence-index dates (same Jan 2020 - Jan 2023 window).
Employ_Rate <- data.table(Dates = as.Date(Confidence_Index$Date), Empl_R = as.numeric(Employement_Rates$`Employement Rate`))
Employ_Rate
str(Employement_Rates)
## spc_tbl_ [37 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date : Date[1:37], format: "2020-01-01" "2020-02-01" ...
## $ Employement Rate: num [1:37] 43.5 43.6 41.9 40.3 41 43 43 43.8 44 43.6 ...
## $ Rates : Time-Series [1:37, 1] from 2020 to 2023: 43.5 43.6 41.9 40.3 41 43 43 43.8 44 43.6 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr "Employement Rate"
## - attr(*, "spec")=
## .. cols(
## .. Date = col_date(format = "%Y-%m"),
## .. `Employement Rate` = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
# Store the employment rate as a monthly ts (duplicates the earlier `Rates`
# column under a new name) and plot it.
Employement_Rates$Employment <- ts(Employement_Rates[,2], start = 2020, frequency = 12)
ggplot(Employement_Rates, aes(x = Date, y = Employment )) + geom_line()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
Now, let’s check for the correlations.
# Add the employment rate to the regression frame. [[ ]] extracts a plain
# numeric vector; `[,2]` on a tibble would store a nested one-column tibble.
series_3$Employment <- Employement_Rates[[2]]
series_3
ggpairs(series_3)
As we can see, there is a strong correlation between the consumer confidence index and the employment rates. Let’s add this data to our model.
# Extend the confidence model with the employment rate.
lm_3 <- lm(Confidence ~ Exchange + CPI + Expenditure + Employment, series_3 )
summary(lm_3)
##
## Call:
## lm(formula = Confidence ~ Exchange + CPI + Expenditure + Employment,
## data = series_3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5210 -1.7974 0.4223 1.6544 5.9199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.064e+02 1.910e+01 5.568 3.80e-06 ***
## Exchange -3.671e+00 8.216e-01 -4.468 9.26e-05 ***
## CPI 4.800e-02 1.704e-02 2.816 0.00825 **
## Expenditure 2.327e-08 1.692e-08 1.376 0.17848
## Employment -5.698e-01 4.542e-01 -1.255 0.21872
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.33 on 32 degrees of freedom
## Multiple R-squared: 0.6804, Adjusted R-squared: 0.6404
## F-statistic: 17.03 on 4 and 32 DF, p-value: 1.41e-07
checkresiduals(lm_3$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 13.899, df = 7, p-value = 0.05301
##
## Model df: 0. Total lags used: 7
After adding the employment rate, we see a 0.006 increase on the R^2, which is a slight change but still helping us to explain our dependent variable in a way. By inspecting the coefficient, we can say that increase in employment rates in this period and for our model will be decreasing the consumer confidence index.
The last data I chose for this model is the search rate of “Zam” from Google trends. The data and the plot is as follows:
# Weekly Google Trends data for the search term "zam" (price hike).
Zam_s <- data.table(Dates = as.Date(Zam$Week), Zam_search = as.numeric(Zam$`zam: (Türkiye)`))
head(Zam_s,18)
# NOTE(review): weekly data, so frequency = 12 mislabels the ts cycle
# (~52 would be correct); the monthly aggregation below does not rely on
# the ts attributes.
Zam$Searches <- ts(Zam[,2], start = 2020, frequency = 12)
# Use bare column names inside aes(); `Zam$Week` bypasses the data argument
# and breaks grouping/faceting semantics.
ggplot(Zam, aes(x = Week, y = `zam: (Türkiye)`)) + geom_line()
Since Google trends is weekly, we continue with aggregating the data to show monthly measures.
# Monthly totals of the weekly "zam" searches. The backticked column name is
# resolved through `data = Zam`, so the `Zam$` prefix is unnecessary.
bymonth <- aggregate(`zam: (Türkiye)` ~ month(Week) + year(Week), data = Zam, FUN = sum)
bymonth
# Rows 2:38 align the aggregate with series_3's 37 months (row 1 is a
# partial month from the weekly data) -- TODO confirm.
series_3$Zam_Search <- bymonth[2:38,3]
ggplot(series_3, aes(x = Dates, y = Zam_Search)) + geom_line()
Later we continue with the process of adding the data to our series data frame and looking for the correlation.
# Inspect the frame and the correlation matrix including Zam_Search.
series_3
ggpairs(series_3)
From the correlation graph, we can see that there is a strong correlation for searches in price rise and the consumer confidence index. Let’s add it to our model to see the effectiveness on it.
# Final confidence-index model: add the "zam" search totals.
lm_3 <- lm(Confidence ~ Exchange + CPI + Expenditure + Employment + Zam_Search, series_3 )
summary(lm_3)
##
## Call:
## lm(formula = Confidence ~ Exchange + CPI + Expenditure + Employment +
## Zam_Search, data = series_3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5885 -1.5589 -0.1387 1.7918 5.1646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.009e+02 1.699e+01 5.940 1.46e-06 ***
## Exchange -2.229e+00 8.602e-01 -2.591 0.01445 *
## CPI 3.192e-02 1.593e-02 2.004 0.05393 .
## Expenditure 7.533e-09 1.579e-08 0.477 0.63667
## Employment -4.401e-01 4.041e-01 -1.089 0.28456
## Zam_Search -4.153e-02 1.323e-02 -3.138 0.00371 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.947 on 31 degrees of freedom
## Multiple R-squared: 0.7574, Adjusted R-squared: 0.7183
## F-statistic: 19.36 on 5 and 31 DF, p-value: 1.02e-08
checkresiduals(lm_3$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 4.1727, df = 7, p-value = 0.7597
##
## Model df: 0. Total lags used: 7
We can see from the summary that R^2 increased significantly, by almost 0.08. This means that the addition of the final data improved our model. Looking at the coefficient, we can deduce that an increase in searches for "Zam" reduces the Consumer Confidence Index, which is understandable given that price increases will negatively affect one's confidence in their current financial status.
From the residual analysis, we can see that it resembles a white noise series with ACF being within the range. Also the distribution of the residuals resembles the normal distribution graph, which indicates that the residuals indeed line up with the white noise series assumptions.
From our final model with R^2 = 0.7183, we can say that our model and the actual values should be fitting by 71.83%, meaning that we can explain 71.83% of the data with the model we created.
Now let’s check our model and residuals versus the actual values.
# Attach fitted values and residuals for diagnostic plots.
series_3$model <- fitted(lm_3)
series_3$residual <- residuals(lm_3)
series_3
# Residuals vs fitted: look for curvature or heteroscedasticity.
ggplot(series_3, aes(x = model, y = residual)) + geom_point()
# Fitted vs actual with an OLS reference line.
ggplot(series_3, aes(x = Confidence, y = model)) + geom_point() + geom_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'
We can see from the residual vs model plot, there seems to be no trend or a pattern to how residuals are scattered. This means that our residuals are good for our analysis.
When we check the model vs. actual value graph, a tendency to gather around a linear slope can be seen but still have values away from the intended range. This means that there may still be some improvements made on our model but we found a correlation between our model and the actual values.
Finally, let’s check how well the model and the actual data fit on each other.
# Fitted vs actual confidence index over time. String literals inside aes()
# become factor levels drawn in the default palette, so the original legend
# was mislabelled "blue"/"orange". Map descriptive labels and pin the real
# colours with scale_color_manual().
ggplot(series_3, aes(x = Dates)) +
  geom_line(aes(y = model, color = "Fitted")) +
  geom_line(aes(y = Confidence, color = "Actual")) +
  scale_color_manual(name = NULL, values = c(Actual = "orange", Fitted = "blue"))
Looking at the graphs, we can see that some of the important aspects were captured with the model we made. The spike at the beginning of 2021, decrease in the confidence afterwards, slight increase at the beginning of 2022, sudden decrease and later the changing increase can all be observed via our model.
##Conclusion
Overall, three models were created to conduct time series regression on three topics of our choosing. The models were based on the monthly datas of Official USD Reserve Assets, Total House Sales in Turkey and the Consumer Confidence Index from January 2020 to January 2023.
Among these three models, the most successful one was the Consumer Confidence Index model, which fit the original values by 71.83%. Next is the Official USD Reserve Assets model, fitting the original data by 55.51%, and lastly, the weakest model is the Total House Sales model with only a 43.97% fit.
Although none of these models were a 100% fit and could linearize the inspected series perfectly, they were all successful on integrating some of the key aspects of the original datas and can be improved even further to get closer to the 100% accuracy.
In all of these data, selecting the related topics to the investigated topic and adding the valuable ones to the model was highly important as in search for the correct datas, the model can also penalize for adding unimportant variables.
Also the other important factor was to observe the graphs correctly and separate the trend and seasonality variables accurately as they may influence the correlation of the residuals in time.